{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "USERS: 6040 ITEMS: 3706\n"
     ]
    }
   ],
   "source": [
    "# Load the tab-separated ratings file (it has no header row) and discard\n",
    "# the timestamp column, which is not used anywhere below.\n",
    "column_names = ['user', 'item', 'rating', 'timestamp']\n",
    "df = (\n",
    "    pd.read_csv('ratings.dat', sep='\\t', names=column_names, header=None)\n",
    "    .drop('timestamp', axis=1)\n",
    ")\n",
    "\n",
    "# Number of distinct users / items that appear in the ratings\n",
    "num_users = df['user'].nunique()\n",
    "num_items = df['item'].nunique()\n",
    "\n",
    "print(\"USERS: {} ITEMS: {}\".format(num_users, num_items))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Rescale the rating column with a min-max scaler (default settings)\n",
    "from sklearn import preprocessing\n",
    "\n",
    "min_max_scaler = preprocessing.MinMaxScaler()\n",
    "\n",
    "# The scaler expects a 2-D array, so the ratings become a single column\n",
    "r = df['rating'].values.astype(float)\n",
    "rating_column = r.reshape(-1, 1)\n",
    "x_scaled = min_max_scaler.fit_transform(rating_column)\n",
    "\n",
    "# NOTE(review): with the scaler's default range the lowest rating is scaled\n",
    "# to 0, the same value the later fillna(0) on the user-item matrix uses for\n",
    "# missing ratings -- confirm this overlap is intended.\n",
    "df_normalized = pd.DataFrame(x_scaled)\n",
    "df['rating'] = df_normalized[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reshape the long ratings table into a user x item matrix:\n",
    "# one row per user, one column per item, cells hold the scaled rating.\n",
    "# User/item pairs without a rating are filled with 0.\n",
    "matrix = (\n",
    "    df.pivot(index='user', columns='item', values='rating')\n",
    "    .fillna(0)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Remember the row (user) and column (item) labels before dropping them,\n",
    "# so positional indices can be mapped back to real ids after prediction.\n",
    "users = matrix.index.tolist()\n",
    "items = matrix.columns.tolist()\n",
    "\n",
    "# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same\n",
    "# NumPy array and is available in every pandas version.\n",
    "matrix = matrix.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Network hyperparameters: input width and the two hidden-layer widths\n",
    "num_input = num_items\n",
    "num_hidden_1 = 10\n",
    "num_hidden_2 = 5\n",
    "\n",
    "# Input placeholder: any number of rows, one column per item\n",
    "X = tf.placeholder(tf.float64, [None, num_input])\n",
    "\n",
    "\n",
    "def _normal_variable(shape):\n",
    "    \"\"\"Return a float64 tf.Variable initialised with tf.random_normal(shape).\"\"\"\n",
    "    return tf.Variable(tf.random_normal(shape, dtype=tf.float64))\n",
    "\n",
    "\n",
    "# Shapes of the layer weights, listed in creation order\n",
    "weight_shapes = [\n",
    "    ('encoder_h1', [num_input, num_hidden_1]),\n",
    "    ('encoder_h2', [num_hidden_1, num_hidden_2]),\n",
    "    ('decoder_h1', [num_hidden_2, num_hidden_1]),\n",
    "    ('decoder_h2', [num_hidden_1, num_input]),\n",
    "]\n",
    "weights = {name: _normal_variable(shape) for name, shape in weight_shapes}\n",
    "\n",
    "# Shapes of the layer biases, listed in creation order\n",
    "bias_shapes = [\n",
    "    ('encoder_b1', [num_hidden_1]),\n",
    "    ('encoder_b2', [num_hidden_2]),\n",
    "    ('decoder_b1', [num_hidden_1]),\n",
    "    ('decoder_b2', [num_input]),\n",
    "]\n",
    "biases = {name: _normal_variable(shape) for name, shape in bias_shapes}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def encoder(x):\n",
    "    \"\"\"Pass x through the two encoder layers (matmul + bias, then ReLU).\"\"\"\n",
    "    hidden = tf.matmul(x, weights['encoder_h1'])\n",
    "    hidden = tf.nn.relu(tf.add(hidden, biases['encoder_b1']))\n",
    "    code = tf.matmul(hidden, weights['encoder_h2'])\n",
    "    code = tf.nn.relu(tf.add(code, biases['encoder_b2']))\n",
    "    return code\n",
    "\n",
    "\n",
    "def decoder(x):\n",
    "    \"\"\"Pass x through the two decoder layers (matmul + bias, then ReLU).\"\"\"\n",
    "    hidden = tf.matmul(x, weights['decoder_h1'])\n",
    "    hidden = tf.nn.relu(tf.add(hidden, biases['decoder_b1']))\n",
    "    output = tf.matmul(hidden, weights['decoder_h2'])\n",
    "    output = tf.nn.relu(tf.add(output, biases['decoder_b2']))\n",
    "    return output\n",
    "\n",
    "\n",
    "# Assemble the full model: the decoder consumes the encoder's output\n",
    "encoder_op = encoder(X)\n",
    "decoder_op = decoder(encoder_op)\n",
    "\n",
    "# The prediction is the reconstruction produced by the decoder ...\n",
    "y_pred = decoder_op\n",
    "\n",
    "# ... and the target is the input itself\n",
    "y_true = X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Loss: mean squared error between the input and its reconstruction\n",
    "loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)\n",
    "\n",
    "# Optimizer: RMSProp with learning rate 0.03, minimising the loss above\n",
    "rmsprop = tf.train.RMSPropOptimizer(learning_rate=0.03)\n",
    "optimizer = rmsprop.minimize(loss)\n",
    "\n",
    "# Empty frame that the training cell fills with the predictions\n",
    "predictions = pd.DataFrame()\n",
    "\n",
    "# Evaluation metric: precision computed from two integer placeholders\n",
    "eval_x = tf.placeholder(tf.int32)\n",
    "eval_y = tf.placeholder(tf.int32)\n",
    "pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Variable initialisation ops; both are run at the start of the session.\n",
    "# Initialiser for the global variables\n",
    "init = tf.global_variables_initializer()\n",
    "# Initialiser for the local variables\n",
    "local_init = tf.local_variables_initializer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch: 1 Loss: 62.36632331212362\n",
      "Epoch: 2 Loss: 2.8954603796203933\n",
      "Epoch: 3 Loss: 0.8328991259137789\n",
      "Epoch: 4 Loss: 0.5103750303387642\n",
      "Epoch: 5 Loss: 0.40919852380951244\n",
      "Epoch: 6 Loss: 0.2719353350500266\n",
      "Epoch: 7 Loss: 0.11627365959187348\n",
      "Epoch: 8 Loss: 0.04368581133894622\n",
      "Epoch: 9 Loss: 0.022539181790004175\n",
      "Epoch: 10 Loss: 0.02073039785803606\n",
      "Epoch: 11 Loss: 0.020577685985093314\n",
      "Epoch: 12 Loss: 0.02057870877130578\n",
      "Epoch: 13 Loss: 0.020559902186505497\n",
      "Epoch: 14 Loss: 0.02056138403713703\n",
      "Epoch: 15 Loss: 0.02056206832639873\n",
      "Epoch: 16 Loss: 0.020555239481230576\n",
      "Epoch: 17 Loss: 0.020553683435233932\n",
      "Epoch: 18 Loss: 0.020552304107695818\n",
      "Epoch: 19 Loss: 0.020546471234411\n",
      "Epoch: 20 Loss: 0.020553408190608025\n",
      "Epoch: 21 Loss: 0.020547243805291753\n",
      "Epoch: 22 Loss: 0.020547459794518847\n",
      "Epoch: 23 Loss: 0.02054187764103214\n",
      "Epoch: 24 Loss: 0.020541465530792873\n",
      "Epoch: 25 Loss: 0.020549740370673437\n",
      "Epoch: 26 Loss: 0.02054038519660632\n",
      "Epoch: 27 Loss: 0.020527764533956844\n",
      "Epoch: 28 Loss: 0.020551347134945292\n",
      "Epoch: 29 Loss: 0.020533538113037746\n",
      "Epoch: 30 Loss: 0.020538544942003984\n",
      "Epoch: 31 Loss: 0.020533038458476465\n",
      "Epoch: 32 Loss: 0.020530214766040444\n",
      "Epoch: 33 Loss: 0.02053276961669326\n",
      "Epoch: 34 Loss: 0.02052874370322873\n",
      "Epoch: 35 Loss: 0.020529168851984043\n",
      "Epoch: 36 Loss: 0.020527750486508012\n",
      "Epoch: 37 Loss: 0.02052694302983582\n",
      "Epoch: 38 Loss: 0.020523735593693953\n",
      "Epoch: 39 Loss: 0.02052681245064984\n",
      "Epoch: 40 Loss: 0.020524489615733426\n",
      "Epoch: 41 Loss: 0.020525843758756917\n",
      "Epoch: 42 Loss: 0.020522404306878645\n",
      "Epoch: 43 Loss: 0.020519051623220246\n",
      "Epoch: 44 Loss: 0.020518526046847303\n",
      "Epoch: 45 Loss: 0.020522128829422098\n",
      "Epoch: 46 Loss: 0.020515818420487147\n",
      "Epoch: 47 Loss: 0.020517294527962804\n",
      "Epoch: 48 Loss: 0.020511170228322346\n",
      "Epoch: 49 Loss: 0.020515617177200813\n",
      "Epoch: 50 Loss: 0.02051336659739415\n",
      "Epoch: 51 Loss: 0.020509604597464204\n",
      "Epoch: 52 Loss: 0.02051199018023908\n",
      "Epoch: 53 Loss: 0.020507035117285948\n",
      "Epoch: 54 Loss: 0.02050953928846866\n",
      "Epoch: 55 Loss: 0.020508303306996822\n",
      "Epoch: 56 Loss: 0.02050593534174065\n",
      "Epoch: 57 Loss: 0.02050364820752293\n",
      "Epoch: 58 Loss: 0.020501573841708403\n",
      "Epoch: 59 Loss: 0.02050897719648977\n",
      "Epoch: 60 Loss: 0.020498287087927263\n",
      "Epoch: 61 Loss: 0.02050747217920919\n",
      "Epoch: 62 Loss: 0.020511943575305242\n",
      "Epoch: 63 Loss: 0.020500182174146175\n",
      "Epoch: 64 Loss: 0.02049516242307921\n",
      "Epoch: 65 Loss: 0.020505099402119715\n",
      "Epoch: 66 Loss: 0.020497227553278208\n",
      "Epoch: 67 Loss: 0.02050105625918756\n",
      "Epoch: 68 Loss: 0.020496489324917395\n",
      "Epoch: 69 Loss: 0.020500448260766763\n",
      "Epoch: 70 Loss: 0.020499490938770275\n",
      "Epoch: 71 Loss: 0.020495102818434436\n",
      "Epoch: 72 Loss: 0.02049667143728584\n",
      "Epoch: 73 Loss: 0.020495163082766037\n",
      "Epoch: 74 Loss: 0.020493092442241807\n",
      "Epoch: 75 Loss: 0.020489083564219374\n",
      "Epoch: 76 Loss: 0.020491546175132196\n",
      "Epoch: 77 Loss: 0.020490625174716115\n",
      "Epoch: 78 Loss: 0.020490886565918725\n",
      "Epoch: 79 Loss: 0.020492181880399585\n",
      "Epoch: 80 Loss: 0.020488501565220456\n",
      "Epoch: 81 Loss: 0.020491315556379657\n",
      "Epoch: 82 Loss: 0.02048771025147289\n",
      "Epoch: 83 Loss: 0.020488823919246595\n",
      "Epoch: 84 Loss: 0.020487341564148664\n",
      "Epoch: 85 Loss: 0.020486407758047182\n",
      "Epoch: 86 Loss: 0.020488536373401683\n",
      "Epoch: 87 Loss: 0.020486171628969412\n",
      "Epoch: 88 Loss: 0.02048757191126545\n",
      "Epoch: 89 Loss: 0.020485026102202635\n",
      "Epoch: 90 Loss: 0.02048375647670279\n",
      "Epoch: 91 Loss: 0.020482595815944176\n",
      "Epoch: 92 Loss: 0.020486746410218377\n",
      "Epoch: 93 Loss: 0.020485238172113895\n",
      "Epoch: 94 Loss: 0.020484034166050453\n",
      "Epoch: 95 Loss: 0.02048404107335955\n",
      "Epoch: 96 Loss: 0.02048494612487654\n",
      "Epoch: 97 Loss: 0.020483742584474385\n",
      "Epoch: 98 Loss: 0.020482821234812338\n",
      "Epoch: 99 Loss: 0.020477980685730774\n",
      "Epoch: 100 Loss: 0.02047935881031056\n",
      "Predictions...\n"
     ]
    }
   ],
   "source": [
    "# Train the autoencoder inside a session, then reconstruct the whole matrix\n",
    "with tf.Session() as session:\n",
    "    epochs = 100\n",
    "    batch_size = 250\n",
    "\n",
    "    session.run(init)\n",
    "    session.run(local_init)\n",
    "\n",
    "    # Split into a separate list and leave `matrix` untouched, so the cell\n",
    "    # can be re-run even after an interrupted run. Use at least one batch:\n",
    "    # with fewer rows than batch_size the quotient is 0 and np.array_split\n",
    "    # would raise.\n",
    "    num_batches = max(1, matrix.shape[0] // batch_size)\n",
    "    batches = np.array_split(matrix, num_batches)\n",
    "\n",
    "    for i in range(epochs):\n",
    "\n",
    "        avg_cost = 0\n",
    "\n",
    "        for batch in batches:\n",
    "            _, l = session.run([optimizer, loss], feed_dict={X: batch})\n",
    "            avg_cost += l\n",
    "\n",
    "        # Mean of the per-batch losses for this epoch\n",
    "        avg_cost /= num_batches\n",
    "\n",
    "        print(\"Epoch: {} Loss: {}\".format(i + 1, avg_cost))\n",
    "\n",
    "    print(\"Predictions...\")\n",
    "\n",
    "    # Reconstruct every user's row in a single pass\n",
    "    preds = session.run(decoder_op, feed_dict={X: matrix})\n",
    "\n",
    "    # Build the frame from scratch instead of appending to the existing\n",
    "    # `predictions`: DataFrame.append was removed in pandas 2.0, and\n",
    "    # appending stacked new rows onto stale results when the cell was re-run.\n",
    "    predictions = pd.DataFrame(preds)\n",
    "\n",
    "    # Wide (row x column) -> long (row, column, value), then translate the\n",
    "    # positional row/column numbers back into the real user and item ids.\n",
    "    predictions = predictions.stack().reset_index(name='rating')\n",
    "    predictions.columns = ['user', 'item', 'rating']\n",
    "    predictions['user'] = predictions['user'].map(lambda value: users[value])\n",
    "    predictions['item'] = predictions['item'].map(lambda value: items[value])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "运行很耗时，文件已生成到当前目录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Compute the top-10 recommendations for every user\n",
    "print(\"Filtering out items in training set\")\n",
    "keys = ['user', 'item']\n",
    "\n",
    "# (user, item) pairs of the predictions and of the ratings in df\n",
    "predicted_pairs = predictions.set_index(keys).index\n",
    "rated_pairs = df.set_index(keys).index\n",
    "\n",
    "# Keep only predictions for pairs that have no rating in df\n",
    "unseen = predictions[~predicted_pairs.isin(rated_pairs)]\n",
    "\n",
    "# Per user: highest predicted rating first, then keep the first 10 rows\n",
    "recs = (\n",
    "    unseen\n",
    "    .sort_values(['user', 'rating'], ascending=[True, False])\n",
    "    .groupby('user')\n",
    "    .head(10)\n",
    ")\n",
    "\n",
    "# Write as tab-separated values without index or header row\n",
    "recs.to_csv('recs.tsv', sep='\\t', index=False, header=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
